This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter (in Windows press CTRL+Enter).

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I (or CTRL+Alt+I in Windows).

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

Package Management Tool

Here i am uploading my packages (code omitted).

Cross section – data

Statistics

  names(wage1)
##  [1] "wage"           "educ"           "exper"          "tenure"        
##  [5] "nonwhite"       "female"         "married"        "numdep"        
##  [9] "smsa"           "northcen"       "south"          "west"          
## [13] "construc"       "ndurman"        "trcommpu"       "trade"         
## [17] "services"       "profserv"       "profocc"        "clerocc"       
## [21] "servocc"        "lwage"          "expersq"        "tenursq"       
## [25] "female_married" "exper_tenure"
  head(wage1) # first 6 observations
  str(wage1) # structure of dataset
## 'data.frame':    526 obs. of  26 variables:
##  $ wage          : num  3.1 3.24 3 6 5.3 ...
##   ..- attr(*, "label")= chr "log(wage)"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ educ          : num  11 12 11 8 12 16 18 12 12 17 ...
##   ..- attr(*, "label")= chr "years of education"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ exper         : num  2 22 2 44 7 9 15 5 26 22 ...
##   ..- attr(*, "label")= chr "years potential experience"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ tenure        : num  0 2 0 28 2 8 7 3 4 21 ...
##   ..- attr(*, "label")= chr "years with current employer"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ nonwhite      : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if nonwhite"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ female        : num  1 1 0 0 0 0 0 1 1 0 ...
##   ..- attr(*, "label")= chr "=1 if female"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ married       : num  0 1 0 1 1 1 0 0 0 1 ...
##   ..- attr(*, "label")= chr "=1 if married"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ numdep        : num  2 3 2 0 1 0 0 0 2 0 ...
##   ..- attr(*, "label")= chr "number of dependents"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ smsa          : num  1 1 0 1 0 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "=1 if live in SMSA"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ northcen      : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if live in north central U.S"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ south         : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if live in southern region"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ west          : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "=1 if live in western region"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ construc      : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if work in construc. indus."
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ ndurman       : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in nondur. manuf. indus."
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ trcommpu      : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in trans, commun, pub ut"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ trade         : num  0 0 1 0 0 0 1 0 1 0 ...
##   ..- attr(*, "label")= chr "=1 if in wholesale or retail"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ services      : num  0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in services indus."
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ profserv      : num  0 0 0 0 0 1 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in prof. serv. indus."
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ profocc       : num  0 0 0 0 0 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "=1 if in profess. occupation"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ clerocc       : num  0 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in clerical occupation"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ servocc       : num  0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if in service occupation"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ lwage         : num  1.13 1.18 1.1 1.79 1.67 ...
##   ..- attr(*, "label")= chr "log(wage)"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expersq       : num  4 484 4 1936 49 ...
##   ..- attr(*, "label")= chr "exper^2"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ tenursq       : num  0 4 0 784 4 64 49 9 16 441 ...
##   ..- attr(*, "label")= chr "tenure^2"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ female_married: num  0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "=1 if female"
##   ..- attr(*, "format.stata")= chr "%8.0g"
##  $ exper_tenure  : num  0 44 0 1232 14 ...
##   ..- attr(*, "label")= chr "years potential experience"
##   ..- attr(*, "format.stata")= chr "%8.0g"
  ExpData(wage1,type=1)
  ExpData(wage1,type=2)
  dplyr::glimpse(wage1)
## Rows: 526
## Columns: 26
## $ wage           <dbl> 3.100000, 3.240000, 3.000000, 6.000000, 5.300000, 8.750…
## $ educ           <dbl> 11, 12, 11, 8, 12, 16, 18, 12, 12, 17, 16, 13, 12, 12, …
## $ exper          <dbl> 2, 22, 2, 44, 7, 9, 15, 5, 26, 22, 8, 3, 15, 18, 31, 14…
## $ tenure         <dbl> 0, 2, 0, 28, 2, 8, 7, 3, 4, 21, 2, 0, 0, 3, 15, 0, 0, 1…
## $ nonwhite       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ female         <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1…
## $ married        <dbl> 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1…
## $ numdep         <dbl> 2, 3, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 3…
## $ smsa           <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ northcen       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ south          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ west           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ construc       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ndurman        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trcommpu       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trade          <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ services       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ profserv       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0…
## $ profocc        <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0…
## $ clerocc        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ servocc        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ lwage          <dbl> 1.1314021, 1.1755733, 1.0986123, 1.7917595, 1.6677068, …
## $ expersq        <dbl> 4, 484, 4, 1936, 49, 81, 225, 25, 676, 484, 64, 9, 225,…
## $ tenursq        <dbl> 0, 4, 0, 784, 4, 64, 49, 9, 16, 441, 4, 0, 0, 9, 225, 0…
## $ female_married <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ exper_tenure   <dbl> 0, 44, 0, 1232, 14, 72, 105, 15, 104, 462, 16, 0, 0, 54…
  dplyr::glimpse(wage1$lwage)
##  num [1:526] 1.13 1.18 1.1 1.79 1.67 ...
##  - attr(*, "label")= chr "log(wage)"
##  - attr(*, "format.stata")= chr "%9.0g"
  mean(wage1$educ)
## [1] 12.56274
  median(wage1$educ)
## [1] 12
  min(wage1$educ)
## [1] 0
  max(wage1$educ)
## [1] 18
  range(wage1$educ)
## [1]  0 18
  quantile(wage1$educ)
##   0%  25%  50%  75% 100% 
##    0   12   12   14   18
  quantile(wage1$educ,0.8)
## 80% 
##  15
  IQR(wage1$educ)
## [1] 2
  sd(wage1$educ)
## [1] 2.769022
  var(wage1$educ)
## [1] 7.667485
  lapply(wage1[,1:3],sd)
## $wage
## [1] 3.693086
## 
## $educ
## [1] 2.769022
## 
## $exper
## [1] 13.57216
  summary(wage1$educ)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.00   12.00   12.56   14.00   18.00
  by(wage1,wage1$female,summary)
## wage1$female: 0
##       wage             educ           exper           tenure      
##  Min.   : 1.500   Min.   : 2.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 4.143   1st Qu.:12.00   1st Qu.: 6.00   1st Qu.: 0.000  
##  Median : 6.000   Median :12.00   Median :14.00   Median : 3.000  
##  Mean   : 7.099   Mean   :12.79   Mean   :17.56   Mean   : 6.474  
##  3rd Qu.: 8.765   3rd Qu.:15.00   3rd Qu.:28.00   3rd Qu.: 9.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite          female     married           numdep           smsa       
##  Min.   :0.0000   Min.   :0   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0   Median :1.0000   Median :0.000   Median :1.0000  
##  Mean   :0.1058   Mean   :0   Mean   :0.6861   Mean   :1.004   Mean   :0.7153  
##  3rd Qu.:0.0000   3rd Qu.:0   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :0   Max.   :1.0000   Max.   :6.000   Max.   :1.0000  
##     northcen          south             west           construc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.2445   Mean   :0.3759   Mean   :0.1496   Mean   :0.06204  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##     ndurman          trcommpu           trade           services      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.1423   Mean   :0.04745   Mean   :0.3102   Mean   :0.06934  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##     profserv         profocc          clerocc           servocc       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.1679   Mean   :0.4489   Mean   :0.04015   Mean   :0.08759  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##      lwage           expersq          tenursq       female_married
##  Min.   :0.4055   Min.   :   1.0   Min.   :   0.0   Min.   :0     
##  1st Qu.:1.4212   1st Qu.:  36.0   1st Qu.:   0.0   1st Qu.:0     
##  Median :1.7918   Median : 196.0   Median :   9.0   Median :0     
##  Mean   :1.8136   Mean   : 489.9   Mean   : 111.7   Mean   :0     
##  3rd Qu.:2.1708   3rd Qu.: 784.0   3rd Qu.:  81.0   3rd Qu.:0     
##  Max.   :3.2181   Max.   :2601.0   Max.   :1936.0   Max.   :0     
##   exper_tenure   
##  Min.   :   0.0  
##  1st Qu.:   0.0  
##  Median :  40.0  
##  Mean   : 176.8  
##  3rd Qu.: 182.0  
##  Max.   :2068.0  
## ------------------------------------------------------------ 
## wage1$female: 1
##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 3.750   Median :12.00   Median :13.00   Median : 2.000  
##  Mean   : 4.588   Mean   :12.32   Mean   :16.43   Mean   : 3.615  
##  3rd Qu.: 5.510   3rd Qu.:13.00   3rd Qu.:26.00   3rd Qu.: 4.000  
##  Max.   :21.630   Max.   :18.00   Max.   :50.00   Max.   :34.000  
##     nonwhite           female     married           numdep     
##  Min.   :0.00000   Min.   :1   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.00000   Median :1   Median :1.0000   Median :1.000  
##  Mean   :0.09921   Mean   :1   Mean   :0.5238   Mean   :1.087  
##  3rd Qu.:0.00000   3rd Qu.:1   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.00000   Max.   :1   Max.   :1.0000   Max.   :5.000  
##       smsa           northcen          south             west       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.7302   Mean   :0.2579   Mean   :0.3333   Mean   :0.1905  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman           trcommpu           trade       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.02778   Mean   :0.08333   Mean   :0.03968   Mean   :0.2619  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1349   Mean   :0.3571   Mean   :0.2778   Mean   :0.3056  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.0986   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.3218   Median : 169.0   Median :   4.00  
##  Mean   :0.1984   Mean   : 1.4164   Mean   : 455.6   Mean   :  41.66  
##  3rd Qu.:0.0000   3rd Qu.: 1.7065   3rd Qu.: 676.0   3rd Qu.:  16.00  
##  Max.   :1.0000   Max.   : 3.0741   Max.   :2500.0   Max.   :1156.00  
##  female_married    exper_tenure   
##  Min.   :0.0000   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.:   0.0  
##  Median :1.0000   Median :  12.5  
##  Mean   :0.5238   Mean   :  91.1  
##  3rd Qu.:1.0000   3rd Qu.:  76.5  
##  Max.   :1.0000   Max.   :1200.0
  summary(wage1,digits=2)
##       wage            educ        exper        tenure        nonwhite  
##  Min.   : 0.53   Min.   : 0   Min.   : 1   Min.   : 0.0   Min.   :0.0  
##  1st Qu.: 3.33   1st Qu.:12   1st Qu.: 5   1st Qu.: 0.0   1st Qu.:0.0  
##  Median : 4.65   Median :12   Median :14   Median : 2.0   Median :0.0  
##  Mean   : 5.90   Mean   :13   Mean   :17   Mean   : 5.1   Mean   :0.1  
##  3rd Qu.: 6.88   3rd Qu.:14   3rd Qu.:26   3rd Qu.: 7.0   3rd Qu.:0.0  
##  Max.   :24.98   Max.   :18   Max.   :51   Max.   :44.0   Max.   :1.0  
##      female        married         numdep       smsa         northcen   
##  Min.   :0.00   Min.   :0.00   Min.   :0   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0   1st Qu.:0.00   1st Qu.:0.00  
##  Median :0.00   Median :1.00   Median :1   Median :1.00   Median :0.00  
##  Mean   :0.48   Mean   :0.61   Mean   :1   Mean   :0.72   Mean   :0.25  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:2   3rd Qu.:1.00   3rd Qu.:0.75  
##  Max.   :1.00   Max.   :1.00   Max.   :6   Max.   :1.00   Max.   :1.00  
##      south           west         construc        ndurman        trcommpu    
##  Min.   :0.00   Min.   :0.00   Min.   :0.000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.000  
##  Median :0.00   Median :0.00   Median :0.000   Median :0.00   Median :0.000  
##  Mean   :0.36   Mean   :0.17   Mean   :0.046   Mean   :0.11   Mean   :0.044  
##  3rd Qu.:1.00   3rd Qu.:0.00   3rd Qu.:0.000   3rd Qu.:0.00   3rd Qu.:0.000  
##  Max.   :1.00   Max.   :1.00   Max.   :1.000   Max.   :1.00   Max.   :1.000  
##      trade         services      profserv       profocc        clerocc    
##  Min.   :0.00   Min.   :0.0   Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00  
##  Median :0.00   Median :0.0   Median :0.00   Median :0.00   Median :0.00  
##  Mean   :0.29   Mean   :0.1   Mean   :0.26   Mean   :0.37   Mean   :0.17  
##  3rd Qu.:1.00   3rd Qu.:0.0   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:0.00  
##  Max.   :1.00   Max.   :1.0   Max.   :1.00   Max.   :1.00   Max.   :1.00  
##     servocc         lwage          expersq        tenursq     female_married
##  Min.   :0.00   Min.   :-0.63   Min.   :   1   Min.   :   0   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.: 1.20   1st Qu.:  25   1st Qu.:   0   1st Qu.:0.00  
##  Median :0.00   Median : 1.54   Median : 182   Median :   4   Median :0.00  
##  Mean   :0.14   Mean   : 1.62   Mean   : 473   Mean   :  78   Mean   :0.25  
##  3rd Qu.:0.00   3rd Qu.: 1.93   3rd Qu.: 676   3rd Qu.:  49   3rd Qu.:0.75  
##  Max.   :1.00   Max.   : 3.22   Max.   :2601   Max.   :1936   Max.   :1.00  
##   exper_tenure 
##  Min.   :   0  
##  1st Qu.:   0  
##  Median :  27  
##  Mean   : 136  
##  3rd Qu.: 114  
##  Max.   :2068
# library(pastecs)
stat.desc(wage1)
# Coefficient of variation
sd(wage1$wage)/mean(wage1$wage)
## [1] 0.6263605
# Mode
tab <- table(wage1$educ) # number of occurrences for each unique value
sort(tab, decreasing = TRUE) # sort highest to lowest
## 
##  12  16  14  13  10  11   8  15  18   9  17   6   7   4   0   2   3   5 
## 198  68  53  39  30  29  22  21  19  17  12   6   4   3   2   1   1   1
# or

sort(table(wage1$educ), decreasing = TRUE)
## 
##  12  16  14  13  10  11   8  15  18   9  17   6   7   4   0   2   3   5 
## 198  68  53  39  30  29  22  21  19  17  12   6   4   3   2   1   1   1

Exploratory data analysis

Start discussing the statistics and graphs.

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.00   12.00   12.56   14.00   18.00
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

MISSING VALUES

  # vis_dat(wage1)

# vis_miss(nlswork) # ALTERNATIVE

  # gg_miss_upset(wage1)

Graphs

# Following the discussion here: http://www.sthda.com/english/wiki/descriptive-statistics-and-graphics

# Box plots
ggboxplot(wage1, y = "lwage", width = 0.5)

# Histogram
gghistogram(wage1, x = "lwage", bins = 9, 
             add = "mean")
## Warning: `geom_vline()`: Ignoring `mapping` because `xintercept` was provided.
## Warning: `geom_vline()`: Ignoring `data` because `xintercept` was provided.

ggplot(wage1) +
  aes(x = educ) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Empirical cumulative distribution function (ECDF)
## ECDF is the fraction of data smaller than or equal to x
ggecdf(wage1, x = "lwage")

# Q-Q plots
## QQ plots is used to check whether the data is normally distributed
ggqqplot(wage1, x = "exper")
## Warning: The following aesthetics were dropped during statistical transformation: sample
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: sample
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Using 'dplyr
group_by(wage1, smsa) %>% 
summarise(
  count = n(), 
  mean = mean(lwage, na.rm = TRUE),
  sd = sd(lwage, na.rm = TRUE)
  )
# # Box plot colored by groups: Species
ggboxplot(wage1, x = "smsa", y = "lwage",
          color = "smsa",
          palette = c("#00AFBB", "#E7B800", "#FC4E07"))

ggplot(wage1) +
  aes(group = female, y = lwage) +
  geom_boxplot()

# library(lattice)

dotplot(wage1$exper ~ wage1$tenure)

# Stripchart colored by groups: Species
ggstripchart(wage1, x = "female", y = "tenure",
          color = "female",
          palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          add = "mean_sd")

# Scatterplot
plot(wage1$exper, wage1$lwage)

ggplot(wage1) +
  aes(x=exper,y=lwage) +
  geom_point()

# http://www.sthda.com/english/wiki/ggplot2-scatter-plots-quick-start-guide-r-software-and-data-visualization
ggplot(wage1) +
  aes(x=exper,y=lwage,color=as.factor(female)) +
  geom_point() +
  scale_color_hue()

ggplot(wage1) +
  aes(x=exper,y=lwage,color=as.factor(female),shape=as.factor(female)) +
  geom_point() + 
  geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'

# Line plot
plot(wage,type = "l")

# Density
plot(density(wage1$lwage))

# or

ggplot(wage1) +
  aes(x=lwage) +
  geom_density()

# Correlation plot
## EXPLORE: https://statsandr.com/blog/correlogram-in-r-how-to-highlight-the-most-correlated-variables-in-a-dataset/

freq(wage1$female)
## Frequencies  
## wage1$female  
## Label: =1 if female  
## Type: Numeric  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0    274     52.09          52.09     52.09          52.09
##           1    252     47.91         100.00     47.91         100.00
##        <NA>      0                               0.00         100.00
##       Total    526    100.00         100.00    100.00         100.00
# ctable(x=wage1$female,y=wage1$married)
summarytools::descr(wage1)
## Descriptive Statistics  
## wage1  
## N: 526  
## 
##                     clerocc   construc     educ    exper   exper_tenure   expersq   female
## ----------------- --------- ---------- -------- -------- -------------- --------- --------
##              Mean      0.17       0.05    12.56    17.02         135.73    473.44     0.48
##           Std.Dev      0.37       0.21     2.77    13.57         263.57    616.04     0.50
##               Min      0.00       0.00     0.00     1.00           0.00      1.00     0.00
##                Q1      0.00       0.00    12.00     5.00           0.00     25.00     0.00
##            Median      0.00       0.00    12.00    13.50          27.00    182.50     0.00
##                Q3      0.00       0.00    14.00    26.00         114.00    676.00     1.00
##               Max      1.00       1.00    18.00    51.00        2068.00   2601.00     1.00
##               MAD      0.00       0.00     1.48    14.08          40.03    264.64     0.00
##               IQR      0.00       0.00     2.00    21.00         114.00    651.00     1.00
##                CV      2.23       4.58     0.22     0.80           1.94      1.30     1.04
##          Skewness      1.78       4.34    -0.62     0.70           3.09      1.49     0.08
##       SE.Skewness      0.11       0.11     0.11     0.11           0.11      0.11     0.11
##          Kurtosis      1.16      16.89     1.87    -0.65          11.65      1.28    -2.00
##           N.Valid    526.00     526.00   526.00   526.00         526.00    526.00   526.00
##         Pct.Valid    100.00     100.00   100.00   100.00         100.00    100.00   100.00
## 
## Table: Table continues below
## 
##  
## 
##                     female_married    lwage   married   ndurman   nonwhite   northcen   numdep
## ----------------- ---------------- -------- --------- --------- ---------- ---------- --------
##              Mean             0.25     1.62      0.61      0.11       0.10       0.25     1.04
##           Std.Dev             0.43     0.53      0.49      0.32       0.30       0.43     1.26
##               Min             0.00    -0.63      0.00      0.00       0.00       0.00     0.00
##                Q1             0.00     1.20      0.00      0.00       0.00       0.00     0.00
##            Median             0.00     1.54      1.00      0.00       0.00       0.00     1.00
##                Q3             1.00     1.93      1.00      0.00       0.00       1.00     2.00
##               Max             1.00     3.22      1.00      1.00       1.00       1.00     6.00
##               MAD             0.00     0.53      0.00      0.00       0.00       0.00     1.48
##               IQR             0.75     0.73      1.00      0.00       0.00       0.75     2.00
##                CV             1.73     0.33      0.80      2.79       2.96       1.73     1.21
##          Skewness             1.15     0.39     -0.44      2.42       2.61       1.15     1.16
##       SE.Skewness             0.11     0.11      0.11      0.11       0.11       0.11     0.11
##          Kurtosis            -0.69     0.37     -1.81      3.87       4.83      -0.69     0.89
##           N.Valid           526.00   526.00    526.00    526.00     526.00     526.00   526.00
##         Pct.Valid           100.00   100.00    100.00    100.00     100.00     100.00   100.00
## 
## Table: Table continues below
## 
##  
## 
##                     profocc   profserv   services   servocc     smsa    south   tenure   tenursq
## ----------------- --------- ---------- ---------- --------- -------- -------- -------- ---------
##              Mean      0.37       0.26       0.10      0.14     0.72     0.36     5.10     78.15
##           Std.Dev      0.48       0.44       0.30      0.35     0.45     0.48     7.22    199.43
##               Min      0.00       0.00       0.00      0.00     0.00     0.00     0.00      0.00
##                Q1      0.00       0.00       0.00      0.00     0.00     0.00     0.00      0.00
##            Median      0.00       0.00       0.00      0.00     1.00     0.00     2.00      4.00
##                Q3      1.00       1.00       0.00      0.00     1.00     1.00     7.00     49.00
##               Max      1.00       1.00       1.00      1.00     1.00     1.00    44.00   1936.00
##               MAD      0.00       0.00       0.00      0.00     0.00     0.00     2.97      5.93
##               IQR      1.00       1.00       0.00      0.00     1.00     1.00     7.00     49.00
##                CV      1.31       1.70       2.99      2.47     0.62     1.35     1.42      2.55
##          Skewness      0.55       1.10       2.65      2.06    -0.99     0.60     2.10      4.36
##       SE.Skewness      0.11       0.11       0.11      0.11     0.11     0.11     0.11      0.11
##          Kurtosis     -1.70      -0.79       5.01      2.25    -1.02    -1.64     4.63     24.80
##           N.Valid    526.00     526.00     526.00    526.00   526.00   526.00   526.00    526.00
##         Pct.Valid    100.00     100.00     100.00    100.00   100.00   100.00   100.00    100.00
## 
## Table: Table continues below
## 
##  
## 
##                      trade   trcommpu     wage     west
## ----------------- -------- ---------- -------- --------
##              Mean     0.29       0.04     5.90     0.17
##           Std.Dev     0.45       0.20     3.69     0.38
##               Min     0.00       0.00     0.53     0.00
##                Q1     0.00       0.00     3.33     0.00
##            Median     0.00       0.00     4.65     0.00
##                Q3     1.00       0.00     6.88     0.00
##               Max     1.00       1.00    24.98     1.00
##               MAD     0.00       0.00     2.37     0.00
##               IQR     1.00       0.00     3.55     0.00
##                CV     1.58       4.68     0.63     2.22
##          Skewness     0.94       4.45     2.00     1.76
##       SE.Skewness     0.11       0.11     0.11     0.11
##          Kurtosis    -1.12      17.84     4.94     1.10
##           N.Valid   526.00     526.00   526.00   526.00
##         Pct.Valid   100.00     100.00   100.00   100.00
dfSummary(wage1)

And now we report some frequency tables:

table(wage1$female)
tbl1 <- as.data.frame(table(wage1$nonwhite))
tbl1

# Visualize using bar plot
ggbarplot(tbl1, x = "Var1", y = "Freq")

barplot(table(wage1$educ))
barplot(prop.table(table(wage1$educ)))

# Two-way contingency table: Two categorical variables
tbl2 <- table(wage1$female, wage1$married)
tbl2

xtabs(~female + married,data=wage1)

tbl3 <- as.data.frame(tbl2)
colnames(tbl3) <- c('female','married','Freq')
ggbarplot(tbl3, x = "female", y = "Freq",
          color = "married", 
          palette = c("brown", "blue", "gold", "green"))

# position dodge
ggbarplot(tbl3, x = "female", y = "Freq",
          color = "married", position = position_dodge(),
          palette = c("brown", "blue", "gold", "green"))


# library(ggplot2)
ggplot(wage1) +
  aes(x = educ) +
  geom_bar()

# Multiway tables: More than two categorical variables
xtabs(~female + married + nonwhite, data = wage1)

# Compute table margins and relative frequency
margin.table(wage1$female, margin = NULL)
# Margin of rows
margin.table(tbl2, 1)
# Margin of columns
margin.table(tbl2, 2)
# Frequencies relative to row total
prop.table(tbl2, 1)
# Table of percentages
round(prop.table(tbl2, 1), 2)*100

# To express the frequencies relative to the grand total, use this:
tbl2/sum(tbl2)

# Cross-tabulations with ctable()
summarytools::ctable( x = wage1$female, y = wage1$married)

Alternative descriptives

# https://statsandr.com/blog/descriptive-statistics-in-r/

# Contingency table
xtabs(~ wage1$female + wage1$married)
##             wage1$married
## wage1$female   0   1
##            0  86 188
##            1 120 132
# percentages by row:
round(prop.table(table(wage1$female, wage1$married), 1), 2) # round to 2 digits 
##    
##        0    1
##   0 0.31 0.69
##   1 0.48 0.52
# percentages by column:
round(prop.table(table(wage1$female, wage1$married), 2), 2) # round to 2 digits with round()
##    
##        0    1
##   0 0.42 0.59
##   1 0.58 0.41
# Mosaic plot
# A mosaic plot allows to visualize a contingency table of two qualitative variables
mosaicplot(table(wage1$female, wage1$married),
  color = TRUE,
  xlab = "Female", # label for x-axis
  ylab = "Married" # label for y-axis
)

Export output

w1 <- data.frame(wage1)
  summary(w1)
##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.330   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 4.650   Median :12.00   Median :13.50   Median : 2.000  
##  Mean   : 5.896   Mean   :12.56   Mean   :17.02   Mean   : 5.105  
##  3rd Qu.: 6.880   3rd Qu.:14.00   3rd Qu.:26.00   3rd Qu.: 7.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite          female          married           numdep     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1027   Mean   :0.4791   Mean   :0.6084   Mean   :1.044  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :6.000  
##       smsa           northcen         south             west       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.7224   Mean   :0.251   Mean   :0.3555   Mean   :0.1692  
##  3rd Qu.:1.0000   3rd Qu.:0.750   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman          trcommpu           trade       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.04563   Mean   :0.1141   Mean   :0.04373   Mean   :0.2871  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1008   Mean   :0.2586   Mean   :0.3669   Mean   :0.1673  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.2030   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.5369   Median : 182.5   Median :   4.00  
##  Mean   :0.1407   Mean   : 1.6233   Mean   : 473.4   Mean   :  78.15  
##  3rd Qu.:0.0000   3rd Qu.: 1.9286   3rd Qu.: 676.0   3rd Qu.:  49.00  
##  Max.   :1.0000   Max.   : 3.2181   Max.   :2601.0   Max.   :1936.00  
##  female_married   exper_tenure   
##  Min.   :0.000   Min.   :   0.0  
##  1st Qu.:0.000   1st Qu.:   0.0  
##  Median :0.000   Median :  27.0  
##  Mean   :0.251   Mean   : 135.7  
##  3rd Qu.:0.750   3rd Qu.: 114.0  
##  Max.   :1.000   Max.   :2068.0
  stargazer(w1,
            title = "Summary statistics",
            label = "tb:statistcis",
            table.placement = "ht",
            header=FALSE,type="text")
## 
## Summary statistics
## =================================================
## Statistic       N   Mean   St. Dev.  Min    Max  
## -------------------------------------------------
## wage           526  5.896   3.693   0.530  24.980
## educ           526 12.563   2.769     0      18  
## exper          526 17.017   13.572    1      51  
## tenure         526  5.105   7.224     0      44  
## nonwhite       526  0.103   0.304     0      1   
## female         526  0.479   0.500     0      1   
## married        526  0.608   0.489     0      1   
## numdep         526  1.044   1.262     0      6   
## smsa           526  0.722   0.448     0      1   
## northcen       526  0.251   0.434     0      1   
## south          526  0.356   0.479     0      1   
## west           526  0.169   0.375     0      1   
## construc       526  0.046   0.209     0      1   
## ndurman        526  0.114   0.318     0      1   
## trcommpu       526  0.044   0.205     0      1   
## trade          526  0.287   0.453     0      1   
## services       526  0.101   0.301     0      1   
## profserv       526  0.259   0.438     0      1   
## profocc        526  0.367   0.482     0      1   
## clerocc        526  0.167   0.374     0      1   
## servocc        526  0.141   0.348     0      1   
## lwage          526  1.623   0.532   -0.635 3.218 
## expersq        526 473.435 616.045    1    2,601 
## tenursq        526 78.150  199.435    0    1,936 
## female_married 526  0.251   0.434     0      1   
## exper_tenure   526 135.728 263.573    0    2,068 
## -------------------------------------------------

Regression analysis

reg1 <- lm(data = wage1,lwage ~ educ + female)
summary(reg1)
## 
## Call:
## lm(formula = lwage ~ educ + female, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02672 -0.27470 -0.03731  0.26219  1.34738 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.826269   0.094054   8.785   <2e-16 ***
## educ         0.077203   0.007047  10.955   <2e-16 ***
## female      -0.360865   0.039024  -9.247   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4455 on 523 degrees of freedom
## Multiple R-squared:  0.3002, Adjusted R-squared:  0.2975 
## F-statistic: 112.2 on 2 and 523 DF,  p-value: < 2.2e-16
reg2 <- lm(data = wage1,lwage ~ educ + female + married + nonwhite + exper + expersq)
summary(reg2)
## 
## Call:
## lm(formula = lwage ~ educ + female + married + nonwhite + exper + 
##     expersq, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.79493 -0.27927 -0.01887  0.26057  1.27372 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.3954849  0.1030918   3.836  0.00014 ***
## educ         0.0826774  0.0070297  11.761  < 2e-16 ***
## female      -0.3293054  0.0367083  -8.971  < 2e-16 ***
## married      0.0638034  0.0422190   1.511  0.13133    
## nonwhite    -0.0147385  0.0597847  -0.247  0.80537    
## exper        0.0358161  0.0052508   6.821 2.52e-11 ***
## expersq     -0.0006334  0.0001131  -5.600 3.48e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4133 on 519 degrees of freedom
## Multiple R-squared:  0.4024, Adjusted R-squared:  0.3955 
## F-statistic: 58.24 on 6 and 519 DF,  p-value: < 2.2e-16

Export table with regression output

  stargazer(reg1,reg2,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("OLS","Pooled"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Education (years)","Female","Married","Experience (years)",
                                 "Experience sqrd."),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 3,
            digits.extra = 5,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ===================================================
##                          OLS            Pooled     
## Education (years)      0.077***        0.083***    
##                        (0.007)          (0.007)    
## Female                -0.361***        -0.329***   
##                        (0.039)          (0.037)    
## Married                                  0.064     
##                                         (0.042)    
## Experience (years)                      -0.015     
##                                         (0.060)    
## Experience sqrd.                       0.036***    
##                                         (0.005)    
## expersq                                -0.001***   
##                                        (0.0001)    
## N                        526              526      
## R2                      0.300            0.402     
## ===================================================
## Notes:             Standard errors in parentheses.

Specification tests

# Heteroskedasticity
  bptest(data = wage1, lwage ~ educ, studentize=F)
## 
##  Breusch-Pagan test
## 
## data:  lwage ~ educ
## BP = 0.95939, df = 1, p-value = 0.3273
# Multicoliniarity
car::vif(reg2)
##      educ    female   married  nonwhite     exper   expersq 
##  1.164649  1.035618  1.307830  1.013994 15.610357 14.925842

PANEL DATA

Panel data – prepare the data

Here I am reading a Stata data file (code omitted).

Statistics

  names(nlswork)
##  [1] "idcode"   "year"     "birth_yr" "age"      "race"     "msp"     
##  [7] "nev_mar"  "grade"    "collgrad" "not_smsa" "c_city"   "south"   
## [13] "ind_code" "occ_code" "union"    "wks_ue"   "ttl_exp"  "tenure"  
## [19] "hours"    "wks_work" "ln_wage"
  head(nlswork)
  # str(nlswork)
  # dplyr::glimpse(nlswork)
  
  dplyr::glimpse(nlswork$ln_wage)
##  num [1:28534] 1.45 1.03 1.59 1.78 1.78 ...
##  - attr(*, "label")= chr "ln(wage/GNP deflator)"
##  - attr(*, "format.stata")= chr "%9.0g"
  ExpData(nlswork,type=1)
  ExpData(nlswork,type=2)

Exploratory data analysis

Start discussing the statistics and graphs.

##      grade      
##  Min.   : 0.00  
##  1st Qu.:12.00  
##  Median :12.00  
##  Mean   :12.53  
##  3rd Qu.:14.00  
##  Max.   :18.00  
##  NA's   :2
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## $`0`

MISSING VALUES

  vis_dat(nlswork)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
##   Please report the issue at <]8;;https://github.com/ropensci/visdat/issueshttps://github.com/ropensci/visdat/issues]8;;>.

# vis_miss(nlswork) # ALTERNATIVE

  gg_miss_upset(nlswork)

Add a dataframe

Export output

  stargazer(nls,
            title = "Summary statistics",
            label = "tb:statistcis",
            table.placement = "ht",
            header=FALSE,type="text")
## 
## Summary statistics
## =================================================
## Statistic   N      Mean    St. Dev.   Min   Max  
## -------------------------------------------------
## idcode    28,534 2,601.284 1,487.359   1   5,159 
## year      28,534  77.959     6.384    68     88  
## birth_yr  28,534  48.085     3.013    41     54  
## age       28,510  29.045     6.701    14     46  
## race      28,534   1.303     0.482     1     3   
## msp       28,518   0.603     0.489     0     1   
## nev_mar   28,518   0.230     0.421     0     1   
## grade     28,532  12.533     2.324     0     18  
## collgrad  28,534   0.168     0.374     0     1   
## not_smsa  28,526   0.282     0.450     0     1   
## c_city    28,526   0.357     0.479     0     1   
## south     28,526   0.410     0.492     0     1   
## ind_code  28,193   7.693     2.994     1     12  
## occ_code  28,413   4.778     3.065     1     13  
## union     19,238   0.234     0.424     0     1   
## wks_ue    22,830   2.548     7.294     0     76  
## ttl_exp   28,534   6.215     4.652   0.000 28.885
## tenure    28,101   3.124     3.751   0.000 25.917
## hours     28,467  36.560     9.870     1    168  
## wks_work  27,831  53.989    29.032     0    104  
## ln_wage   28,534   1.675     0.478   0.000 5.264 
## -------------------------------------------------

Produce a graph

plot(nlswork$grade,nlswork$ln_wage)

Save the graph

## Warning: Removed 2 rows containing missing values (`geom_point()`).

## Saving 7 x 5 in image
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Tabulations and further statistics

## Frequencies  
## nlswork$year  
## Label: interview year  
## Type: Numeric  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##          88    2272      7.96           7.96      7.96           7.96
##          77    2171      7.61          15.57      7.61          15.57
##          87    2164      7.58          23.15      7.58          23.15
##          75    2141      7.50          30.66      7.50          30.66
##          82    2085      7.31          37.97      7.31          37.97
##          85    2085      7.31          45.27      7.31          45.27
##          83    1987      6.96          52.24      6.96          52.24
##          73    1981      6.94          59.18      6.94          59.18
##          78    1964      6.88          66.06      6.88          66.06
##          71    1851      6.49          72.55      6.49          72.55
##          80    1847      6.47          79.02      6.47          79.02
##          72    1693      5.93          84.95      5.93          84.95
##          70    1686      5.91          90.86      5.91          90.86
##          68    1375      4.82          95.68      4.82          95.68
##          69    1232      4.32         100.00      4.32         100.00
##        <NA>       0                               0.00         100.00
##       Total   28534    100.00         100.00    100.00         100.00
## Frequencies  
## nlswork$race  
## Label: 1=white, 2=black, 3=other  
## Type: Numeric  
## 
##                Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------- --------- -------------- --------- --------------
##           1   20180     70.72          70.72     70.72          70.72
##           2    8051     28.22          98.94     28.22          98.94
##           3     303      1.06         100.00      1.06         100.00
##        <NA>       0                               0.00         100.00
##       Total   28534    100.00         100.00    100.00         100.00

A new dataset: exclude onservations with missing information in a subset of variables

Add variable

Plot variables means

Regression analysis

Q1

Pooled OLS model

Export regression output

## 
## Regression analysis
## =================================================
##                        OLS             panel     
##                                       linear     
##                        OLS            Pooled     
## -------------------------------------------------
## Union                0.113***        0.113***    
##                      (0.007)          (0.007)    
## Collage Graduate     0.351***        0.351***    
##                      (0.007)          (0.007)    
## Age                  0.022***        0.022***    
##                      (0.004)          (0.004)    
## Age sqrd.           -0.0003***      -0.0003***   
##                      (0.0001)        (0.0001)    
## Tenure               0.055***        0.055***    
##                      (0.002)          (0.002)    
## Tenure sqrd.        -0.002***        -0.002***   
##                      (0.0001)        (0.0001)    
## Not SMSA            -0.205***        -0.205***   
##                      (0.007)          (0.007)    
## South               -0.141***        -0.141***   
##                      (0.006)          (0.006)    
## City                -0.032***        -0.032***   
##                      (0.007)          (0.007)    
## N                     19,007          19,007     
## R2                    0.319            0.319     
## =================================================
## Notes:           Standard errors in parentheses.
  • SMSA: Standard Metropolitan Statistical Area
# ftable(c_city) # 1 if central city

CLUSTERED Standard-errors

  pols_robust <- coeftest(pols, function(x) vcovHC(x, type = 'sss')) 
  
  stargazer(pols,pols_robust,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("Pooled","Pooled (cluster)"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","Collage graduate","Age","Age sqrd.",
                                 "Tenure","Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 6,
            digits.extra = 7,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## =================================================
##                       panel        coefficient   
##                      linear            test      
##                      Pooled      Pooled (cluster)
## -------------------------------------------------
## Union              0.112774***     0.112774***   
##                    (0.006774)       (0.011731)   
## Collage graduate   0.350946***     0.350946***   
##                    (0.007149)       (0.014112)   
## Age                0.022481***     0.022481***   
##                    (0.004249)       (0.005373)   
## Age sqrd.         -0.000306***     -0.000306***  
##                    (0.000068)       (0.000088)   
## Tenure             0.054787***     0.054787***   
##                    (0.001944)       (0.002743)   
## Tenure sqrd.      -0.001540***     -0.001540***  
##                    (0.000125)       (0.000180)   
## Not SMSA          -0.205457***     -0.205457***  
##                    (0.007120)       (0.013137)   
## South             -0.140589***     -0.140589***  
##                    (0.005850)       (0.010964)   
## City              -0.031543***     -0.031543***  
##                    (0.006683)       (0.011691)   
## N                    19,007                      
## R2                  0.319459                     
## =================================================
## Notes:           Standard errors in parentheses.

Q2

Random effects estimator (RE)

  • SEE THE DISCUSSION HERE for the comparison between R and Stata

https://stats.stackexchange.com/questions/421374/different-results-from-random-effects-plm-r-and-xtreg-stata

for a balanced panel we have

  nlswork_balanced <- read_dta("data/nlswork_balanced.dta")
  
  re_balanced <- plm(data = nlswork_balanced, ln_wage ~ union +
                       collgrad +age +agesq +tenure +tensq +
                       not_smsa +south +c_city, model="random",
                     index=c("idcode", "year"))
      summary(re_balanced)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = ln_wage ~ union + collgrad + age + agesq + tenure + 
##     tensq + not_smsa + south + c_city, data = nlswork_balanced, 
##     model = "random", index = c("idcode", "year"))
## 
## Balanced Panel: n = 53, T = 12, N = 636
## 
## Effects:
##                   var std.dev share
## idiosyncratic 0.03698 0.19230 0.319
## individual    0.07902 0.28110 0.681
## theta: 0.8063
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.904519 -0.104813  0.015673  0.114854  0.658580 
## 
## Coefficients:
##                Estimate  Std. Error z-value  Pr(>|z|)    
## (Intercept)  1.05650211  0.20514289  5.1501 2.604e-07 ***
## union        0.06087668  0.02949229  2.0642 0.0390030 *  
## collgrad     0.20128253  0.17651938  1.1403 0.2541673    
## age          0.04592525  0.01334703  3.4409 0.0005799 ***
## agesq       -0.00051693  0.00020986 -2.4632 0.0137701 *  
## tenure       0.00332254  0.00616575  0.5389 0.5899766    
## tensq        0.00011290  0.00028483  0.3964 0.6918193    
## not_smsa    -0.29590935  0.07567198 -3.9104 9.214e-05 ***
## south       -0.06346941  0.06258349 -1.0142 0.3105084    
## c_city       0.00068168  0.03609511  0.0189 0.9849322    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    32.65
## Residual Sum of Squares: 23.689
## R-Squared:      0.27447
## Adj. R-Squared: 0.26404
## Chisq: 236.819 on 9 DF, p-value: < 2.22e-16
  re <- plm(data = nlswork_clean, ln_wage ~ union +
              collgrad +age +agesq +tenure +tensq +
              not_smsa +south +c_city, model="random",
            index=c("idcode", "year"))
      summary(re)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = ln_wage ~ union + collgrad + age + agesq + tenure + 
##     tensq + not_smsa + south + c_city, data = nlswork_clean, 
##     model = "random", index = c("idcode", "year"))
## 
## Unbalanced Panel: n = 4134, T = 1-12, N = 19007
## 
## Effects:
##                   var std.dev share
## idiosyncratic 0.06476 0.25448 0.444
## individual    0.08108 0.28475 0.556
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3336  0.5920  0.6572  0.6406  0.6987  0.7502 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.77886 -0.13081  0.00854  0.00428  0.14314  3.03289 
## 
## Coefficients:
##                Estimate  Std. Error  z-value  Pr(>|z|)    
## (Intercept)  1.1369e+00  5.0729e-02  22.4103 < 2.2e-16 ***
## union        1.0368e-01  6.4468e-03  16.0819 < 2.2e-16 ***
## collgrad     3.6931e-01  1.2344e-02  29.9171 < 2.2e-16 ***
## age          2.3031e-02  3.3174e-03   6.9424 3.856e-12 ***
## agesq       -2.4910e-04  5.3181e-05  -4.6839 2.814e-06 ***
## tenure       4.0771e-02  1.5826e-03  25.7618 < 2.2e-16 ***
## tensq       -1.2466e-03  1.0022e-04 -12.4393 < 2.2e-16 ***
## not_smsa    -1.5114e-01  9.3804e-03 -16.1119 < 2.2e-16 ***
## south       -1.1157e-01  8.4671e-03 -13.1766 < 2.2e-16 ***
## c_city       3.9713e-04  7.4923e-03   0.0530    0.9577    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1942.3
## Residual Sum of Squares: 1268
## R-Squared:      0.34903
## Adj. R-Squared: 0.34872
## Chisq: 4820.91 on 9 DF, p-value: < 2.22e-16
  re_robust <- coeftest(re, function(x) vcovHC(x, type = 'sss')) 
  stargazer(pols,pols_robust,re,re_robust,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("Pooled","Pooled (cluster)","RE","RE (cluster"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","College Graduate","Age","Age sqrd.",
                                 "Tenure","Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 3,
            digits.extra = 5,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ===================================================================
##                    panel      coefficient      panel    coefficient
##                    linear         test         linear      test    
##                    Pooled   Pooled (cluster)     RE     RE (cluster
## -------------------------------------------------------------------
## Union             0.113***      0.113***      0.104***   0.104***  
##                   (0.007)       (0.012)       (0.006)     (0.009)  
## College Graduate  0.351***      0.351***      0.369***   0.369***  
##                   (0.007)       (0.014)       (0.012)     (0.013)  
## Age               0.022***      0.022***      0.023***   0.023***  
##                   (0.004)       (0.005)       (0.003)     (0.005)  
## Age sqrd.        -0.0003***    -0.0003***    -0.0002*** -0.0002*** 
##                   (0.0001)      (0.0001)      (0.0001)   (0.0001)  
## Tenure            0.055***      0.055***      0.041***   0.041***  
##                   (0.002)       (0.003)       (0.002)     (0.002)  
## Tenure sqrd.     -0.002***     -0.002***     -0.001***   -0.001*** 
##                   (0.0001)      (0.0002)      (0.0001)   (0.0001)  
## Not SMSA         -0.205***     -0.205***     -0.151***   -0.151*** 
##                   (0.007)       (0.013)       (0.009)     (0.012)  
## South            -0.141***     -0.141***     -0.112***   -0.112*** 
##                   (0.006)       (0.011)       (0.008)     (0.011)  
## City             -0.032***     -0.032***       0.0004     0.0004   
##                   (0.007)       (0.012)       (0.007)     (0.010)  
## N                  19,007                      19,007              
## R2                 0.319                       0.349               
## ===================================================================
## Notes:           Standard errors in parentheses.
  stargazer(pols,pols_robust,re,re_robust,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("Pooled","Pooled (cluster)","RE","RE (cluster"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","College Graduate","Age","Age sqrd.",
                                 "Tenure","Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 3,
            digits.extra = 5,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ===================================================================
##                    panel      coefficient      panel    coefficient
##                    linear         test         linear      test    
##                    Pooled   Pooled (cluster)     RE     RE (cluster
## -------------------------------------------------------------------
## Union             0.113***      0.113***      0.104***   0.104***  
##                   (0.007)       (0.012)       (0.006)     (0.009)  
## College Graduate  0.351***      0.351***      0.369***   0.369***  
##                   (0.007)       (0.014)       (0.012)     (0.013)  
## Age               0.022***      0.022***      0.023***   0.023***  
##                   (0.004)       (0.005)       (0.003)     (0.005)  
## Age sqrd.        -0.0003***    -0.0003***    -0.0002*** -0.0002*** 
##                   (0.0001)      (0.0001)      (0.0001)   (0.0001)  
## Tenure            0.055***      0.055***      0.041***   0.041***  
##                   (0.002)       (0.003)       (0.002)     (0.002)  
## Tenure sqrd.     -0.002***     -0.002***     -0.001***   -0.001*** 
##                   (0.0001)      (0.0002)      (0.0001)   (0.0001)  
## Not SMSA         -0.205***     -0.205***     -0.151***   -0.151*** 
##                   (0.007)       (0.013)       (0.009)     (0.012)  
## South            -0.141***     -0.141***     -0.112***   -0.112*** 
##                   (0.006)       (0.011)       (0.008)     (0.011)  
## City             -0.032***     -0.032***       0.0004     0.0004   
##                   (0.007)       (0.012)       (0.007)     (0.010)  
## N                  19,007                      19,007              
## R2                 0.319                       0.349               
## ===================================================================
## Notes:           Standard errors in parentheses.

LM test for the presence of unobserved effects

  plmtest(pols, type=c("bp"))
## 
##  Lagrange Multiplier Test - (Breusch-Pagan)
## 
## data:  ln_wage ~ union + collgrad + age + agesq + tenure + tensq + not_smsa +  ...
## chisq = 14041, df = 1, p-value < 2.2e-16
## alternative hypothesis: significant effects
  kable(tidy(plmtest(pols, type=c("bp"))), format = "simple",caption=
          "LM test for the presence of unobserved effects")
LM test for the presence of unobserved effects
statistic p.value parameter method alternative
14041.19 0 1 Lagrange Multiplier Test - (Breusch-Pagan) significant effects

Fixed effects estimator (FE)

# //Final slide 32
# 
# *Q3*
#

  fe <- plm(data = nlswork_clean, ln_wage ~ union +
              collgrad +age +agesq +tenure +tensq +
              not_smsa +south +c_city, model="within", index=c("idcode", "year"))
      
      summary(fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = ln_wage ~ union + collgrad + age + agesq + tenure + 
##     tensq + not_smsa + south + c_city, data = nlswork_clean, 
##     model = "within", index = c("idcode", "year"))
## 
## Unbalanced Panel: n = 4134, T = 1-12, N = 19007
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -1.88027 -0.10216  0.00000  0.10774  2.80710 
## 
## Coefficients:
##             Estimate  Std. Error  t-value  Pr(>|t|)    
## union     9.3877e-02  6.9662e-03  13.4761 < 2.2e-16 ***
## age       2.4259e-02  3.4467e-03   7.0383 2.031e-12 ***
## agesq    -2.2618e-04  5.5316e-05  -4.0890 4.356e-05 ***
## tenure    3.2966e-02  1.6465e-03  20.0218 < 2.2e-16 ***
## tensq    -1.1002e-03  1.0291e-04 -10.6916 < 2.2e-16 ***
## not_smsa -9.3105e-02  1.2970e-02  -7.1787 7.372e-13 ***
## south    -6.3222e-02  1.3279e-02  -4.7611 1.944e-06 ***
## c_city    1.1409e-02  8.8964e-03   1.2824    0.1997    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1119.5
## Residual Sum of Squares: 962.69
## R-Squared:      0.14005
## Adj. R-Squared: -0.099505
## F-statistic: 302.62 on 8 and 14865 DF, p-value: < 2.22e-16
  stargazer(pols,re,fe,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("Pooled","RE","FE"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","College","Age","Age sqrd.","Tenure",
                                 "Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 6,
            digits.extra = 7,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ===================================================
##                 Pooled         RE           FE     
## Union        0.112774***  0.103677***  0.093877*** 
##               (0.006774)   (0.006447)   (0.006966) 
## College      0.350946***  0.369309***              
##               (0.007149)   (0.012344)              
## Age          0.022481***  0.023031***  0.024259*** 
##               (0.004249)   (0.003317)   (0.003447) 
## Age sqrd.    -0.000306*** -0.000249*** -0.000226***
##               (0.000068)   (0.000053)   (0.000055) 
## Tenure       0.054787***  0.040771***  0.032966*** 
##               (0.001944)   (0.001583)   (0.001646) 
## Tenure sqrd. -0.001540*** -0.001247*** -0.001100***
##               (0.000125)   (0.000100)   (0.000103) 
## Not SMSA     -0.205457*** -0.151136*** -0.093105***
##               (0.007120)   (0.009380)   (0.012970) 
## South        -0.140589*** -0.111567*** -0.063222***
##               (0.005850)   (0.008467)   (0.013279) 
## City         -0.031543***   0.000397     0.011409  
##               (0.006683)   (0.007492)   (0.008896) 
## N               19,007       19,007       19,007   
## R2             0.319459     0.349029     0.140054  
## ===================================================
## Notes:       Standard errors in parentheses.
# Testing for fixed effects, null: OLS better than fixed
# 'F test for individual effects' <<==>> 'F test that all u_i=0'

  ols_0 <- lm(data = nlswork_clean, ln_wage ~ union +
                age +agesq +tenure +tensq +
                not_smsa +south +c_city)
      summary(ols_0)
## 
## Call:
## lm(formula = ln_wage ~ union + age + agesq + tenure + tensq + 
##     not_smsa + south + c_city, data = nlswork_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7497 -0.2508 -0.0182  0.2379  3.4100 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.008e+00  6.821e-02  14.776  < 2e-16 ***
## union        1.294e-01  7.181e-03  18.026  < 2e-16 ***
## age          3.929e-02  4.495e-03   8.740  < 2e-16 ***
## agesq       -5.387e-04  7.175e-05  -7.509 6.24e-14 ***
## tenure       5.806e-02  2.062e-03  28.158  < 2e-16 ***
## tensq       -1.699e-03  1.331e-04 -12.765  < 2e-16 ***
## not_smsa    -2.311e-01  7.538e-03 -30.657  < 2e-16 ***
## south       -1.475e-01  6.208e-03 -23.762  < 2e-16 ***
## c_city      -3.451e-02  7.094e-03  -4.864 1.16e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4091 on 18998 degrees of freedom
## Multiple R-squared:  0.2331, Adjusted R-squared:  0.2328 
## F-statistic: 721.9 on 8 and 18998 DF,  p-value: < 2.2e-16
  pFtest(fe, ols_0)
## 
##  F test for individual effects
## 
## data:  ln_wage ~ union + collgrad + age + agesq + tenure + tensq + not_smsa +  ...
## F = 8.2833, df1 = 4133, df2 = 14865, p-value < 2.2e-16
## alternative hypothesis: significant effects
# generate fixed-effects

# nlswork_clean$specific_effects <- fixef(fe)

# *Q3.1*

  fe_robust <- coeftest(fe, function(x) vcovHC(x, type = 'sss')) 

  stargazer(ols_0,fe,fe_robust,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("OLS","FE","FE (cluster)"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","Age","Age sqrd.","Tenure",
                                 "Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 6,
            digits.extra = 7,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ===================================================
##                  OLS         panel     coefficient 
##                              linear        test    
##                  OLS           FE      FE (cluster)
## ---------------------------------------------------
## Union        0.129446***  0.093877***  0.093877*** 
##               (0.007181)   (0.006966)   (0.009565) 
## Age          0.039289***  0.024259***  0.024259*** 
##               (0.004495)   (0.003447)   (0.005008) 
## Age sqrd.    -0.000539*** -0.000226*** -0.000226***
##               (0.000072)   (0.000055)   (0.000081) 
## Tenure       0.058063***  0.032966***  0.032966*** 
##               (0.002062)   (0.001646)   (0.002085) 
## Tenure sqrd. -0.001699*** -0.001100*** -0.001100***
##               (0.000133)   (0.000103)   (0.000126) 
## Not SMSA     -0.231095*** -0.093105*** -0.093105***
##               (0.007538)   (0.012970)   (0.019790) 
## South        -0.147518*** -0.063222*** -0.063222***
##               (0.006208)   (0.013279)   (0.021653) 
## City         -0.034506***   0.011409     0.011409  
##               (0.007094)   (0.008896)   (0.012605) 
## N               19,007       19,007                
## R2             0.233124     0.140054               
## ===================================================
## Notes:       Standard errors in parentheses.
# *Q3.2*

  linearHypothesis(ols,c("age=0","agesq=0"))
  linearHypothesis(ols,c("age=0","agesq=0"), white.adjust = "hc1")
  Wald_test(fe, vcov = "CR1", cluster = nlswork_clean$idcode, 
             constraints = constrain_zero(c("age","agesq")), test = "Naive-F")

LSDV estimator

# *LSDV Estimator=FE estimator* <<==>> takes too long
# *using a smaller sample*

nlswork_balanced <- read_dta("data/nlswork_balanced_small.dta")

  LSDV <- lm(data = nlswork_balanced, ln_wage ~ union +
               age +agesq +tenure +tensq +
               not_smsa +south +c_city + factor(idcode))
  summary(LSDV)
## 
## Call:
## lm(formula = ln_wage ~ union + age + agesq + tenure + tensq + 
##     not_smsa + south + c_city + factor(idcode), data = nlswork_balanced)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.254362 -0.069457  0.004535  0.078344  0.239917 
## 
## Coefficients: (2 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        0.5332323  0.4915522   1.085   0.2833  
## union              0.0895838  0.0548743   1.633   0.1090  
## age                0.0758967  0.0334355   2.270   0.0276 *
## agesq             -0.0011959  0.0005405  -2.212   0.0316 *
## tenure             0.0125105  0.0162130   0.772   0.4440  
## tensq              0.0006329  0.0007176   0.882   0.3821  
## not_smsa                  NA         NA      NA       NA  
## south                     NA         NA      NA       NA  
## c_city             0.1164782  0.0875330   1.331   0.1895  
## factor(idcode)20   0.3287869  0.1289784   2.549   0.0140 *
## factor(idcode)126 -0.0167263  0.0546551  -0.306   0.7609  
## factor(idcode)128  0.0375653  0.0882907   0.425   0.6724  
## factor(idcode)379 -0.1504303  0.1188913  -1.265   0.2118  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1149 on 49 degrees of freedom
## Multiple R-squared:  0.8021, Adjusted R-squared:  0.7617 
## F-statistic: 19.86 on 10 and 49 DF,  p-value: 5.49e-14
  fe_LSDV <- plm(data = nlswork_balanced, ln_wage ~ union +
              age +agesq +tenure +tensq +
              not_smsa +south +c_city, model="within", index=c("idcode", "year"))
  summary(fe_LSDV)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = ln_wage ~ union + age + agesq + tenure + tensq + 
##     not_smsa + south + c_city, data = nlswork_balanced, model = "within", 
##     index = c("idcode", "year"))
## 
## Balanced Panel: n = 5, T = 12, N = 60
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -0.2543619 -0.0694565  0.0045351  0.0783435  0.2399171 
## 
## Coefficients:
##           Estimate  Std. Error t-value Pr(>|t|)  
## union   0.08958377  0.05487432  1.6325  0.10898  
## age     0.07589667  0.03343547  2.2699  0.02765 *
## agesq  -0.00119586  0.00054052 -2.2124  0.03163 *
## tenure  0.01251051  0.01621304  0.7716  0.44404  
## tensq   0.00063295  0.00071759  0.8821  0.38206  
## c_city  0.11647818  0.08753305  1.3307  0.18945  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    2.3839
## Residual Sum of Squares: 0.64685
## R-Squared:      0.72866
## Adj. R-Squared: 0.67329
## F-statistic: 21.9312 on 6 and 49 DF, p-value: 2.4403e-12
  stargazer(LSDV,fe_LSDV,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("LSDV","FE"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            covariate.labels = c("Union","Age","Age sqrd.","Tenure",
                                 "Tenure sqrd.","Not SMSA","South","City"),
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 6,
            digits.extra = 7,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ==================================================
##                         OLS             panel     
##                                        linear     
##                         LSDV             FE       
## --------------------------------------------------
## Union                 0.089584        0.089584    
##                      (0.054874)      (0.054874)   
## Age                  0.075897**      0.075897**   
##                      (0.033435)      (0.033435)   
## Age sqrd.           -0.001196**      -0.001196**  
##                      (0.000541)      (0.000541)   
## Tenure                0.012511        0.012511    
##                      (0.016213)      (0.016213)   
## Tenure sqrd.          0.000633        0.000633    
##                      (0.000718)      (0.000718)   
## Not SMSA                                          
##                                                   
## South                                             
##                                                   
## City                  0.116478        0.116478    
##                      (0.087533)      (0.087533)   
## factor(idcode)20     0.328787**                   
##                      (0.128978)                   
## factor(idcode)126    -0.016726                    
##                      (0.054655)                   
## factor(idcode)128     0.037565                    
##                      (0.088291)                   
## factor(idcode)379    -0.150430                    
##                      (0.118891)                   
## N                        60              60       
## R2                    0.802119        0.728663    
## ==================================================
## Notes:            Standard errors in parentheses.

Hausman test

# //Final slide 35
# 
# *Q4*
  fe_0 <- plm(data = nlswork_clean, ln_wage ~ union +
                age +agesq +tenure +tensq +
                not_smsa +south +c_city, model="within", index=c("idcode", "year"))
  re_0 <- plm(data = nlswork_clean, ln_wage ~ union +
                age +agesq +tenure +tensq +
                not_smsa +south +c_city, model="random", index=c("idcode", "year"))
  
  phtest(fe_0, re_0)    
## 
##  Hausman Test
## 
## data:  ln_wage ~ union + age + agesq + tenure + tensq + not_smsa + south +  ...
## chisq = 607.1, df = 8, p-value < 2.2e-16
## alternative hypothesis: one model is inconsistent

BE estimator

#   //Final slide 46
# 
# *Q5*

  be <- plm(data = nlswork_clean, ln_wage ~ union +
              collgrad +age +agesq +tenure +tensq +
              not_smsa +south +c_city, model="between",
            index=c("idcode", "year"))
  
      summary(be)
## Oneway (individual) effect Between Model
## 
## Call:
## plm(formula = ln_wage ~ union + collgrad + age + agesq + tenure + 
##     tensq + not_smsa + south + c_city, data = nlswork_clean, 
##     model = "between", index = c("idcode", "year"))
## 
## Unbalanced Panel: n = 4134, T = 1-12, N = 19007
## Observations used in estimation: 4134
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -1.5946586 -0.2004452 -0.0067495  0.1909616  1.8487578 
## 
## Coefficients:
##                Estimate  Std. Error  t-value  Pr(>|t|)    
## (Intercept)  1.19446539  0.15356828   7.7781 9.240e-15 ***
## union        0.11135322  0.01636297   6.8052 1.154e-11 ***
## collgrad     0.34850931  0.01308193  26.6405 < 2.2e-16 ***
## age          0.02113165  0.01022918   2.0658   0.03891 *  
## agesq       -0.00034316  0.00016368  -2.0965   0.03610 *  
## tenure       0.09570307  0.00539210  17.7488 < 2.2e-16 ***
## tensq       -0.00343998  0.00036430  -9.4426 < 2.2e-16 ***
## not_smsa    -0.20536231  0.01440760 -14.2537 < 2.2e-16 ***
## south       -0.13058886  0.01145804 -11.3971 < 2.2e-16 ***
## c_city      -0.02215011  0.01385878  -1.5983   0.11006    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    773.5
## Residual Sum of Squares: 464.95
## R-Squared:      0.3989
## Adj. R-Squared: 0.39759
## F-statistic: 304.083 on 9 and 4124 DF, p-value: < 2.22e-16

FD estimator

# //Final slide 53
# 
# *Q6*
  fd <- plm(data = nlswork_clean, ln_wage ~ 0 + union +
              collgrad +age +agesq +tenure +tensq +
              not_smsa +south +c_city, model="fd",
            index=c("idcode", "year"))
    
    summary(fd)
## Oneway (individual) effect First-Difference Model
## 
## Call:
## plm(formula = ln_wage ~ 0 + union + collgrad + age + agesq + 
##     tenure + tensq + not_smsa + south + c_city, data = nlswork_clean, 
##     model = "fd", index = c("idcode", "year"))
## 
## Unbalanced Panel: n = 4134, T = 1-12, N = 19007
## Observations used in estimation: 14873
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.9266 -0.0925  0.0073  0.0156  0.1279  3.3217 
## 
## Coefficients:
##             Estimate  Std. Error t-value  Pr(>|t|)    
## union     6.9160e-02  6.6336e-03 10.4257 < 2.2e-16 ***
## age       2.2744e-02  5.5436e-03  4.1027 4.104e-05 ***
## agesq    -2.5853e-04  9.0084e-05 -2.8698  0.004113 ** 
## tenure    3.2078e-02  2.1241e-03 15.1024 < 2.2e-16 ***
## tensq    -1.2023e-03  1.7526e-04 -6.8600 7.160e-12 ***
## not_smsa -7.7277e-02  1.4369e-02 -5.3780 7.645e-08 ***
## south    -4.6889e-02  1.5675e-02 -2.9913  0.002782 ** 
## c_city    2.2987e-02  9.9149e-03  2.3185  0.020437 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1441.5
## Residual Sum of Squares: 1404.4
## R-Squared:      0.02879
## Adj. R-Squared: 0.028332
## F-statistic: 85.5285 on 8 and 14865 DF, p-value: < 2.22e-16

Output Table

  stargazer(pols,re,fe,be,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("OLS","RE","FE","BE"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 6,
            digits.extra = 7,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ============================================================
##              OLS           RE           FE           BE     
## union    0.112774***  0.103677***  0.093877***  0.111353*** 
##           (0.006774)   (0.006447)   (0.006966)   (0.016363) 
## collgrad 0.350946***  0.369309***               0.348509*** 
##           (0.007149)   (0.012344)                (0.013082) 
## age      0.022481***  0.023031***  0.024259***   0.021132** 
##           (0.004249)   (0.003317)   (0.003447)   (0.010229) 
## agesq    -0.000306*** -0.000249*** -0.000226*** -0.000343** 
##           (0.000068)   (0.000053)   (0.000055)   (0.000164) 
## tenure   0.054787***  0.040771***  0.032966***  0.095703*** 
##           (0.001944)   (0.001583)   (0.001646)   (0.005392) 
## tensq    -0.001540*** -0.001247*** -0.001100*** -0.003440***
##           (0.000125)   (0.000100)   (0.000103)   (0.000364) 
## not_smsa -0.205457*** -0.151136*** -0.093105*** -0.205362***
##           (0.007120)   (0.009380)   (0.012970)   (0.014408) 
## south    -0.140589*** -0.111567*** -0.063222*** -0.130589***
##           (0.005850)   (0.008467)   (0.013279)   (0.011458) 
## c_city   -0.031543***   0.000397     0.011409    -0.022150  
##           (0.006683)   (0.007492)   (0.008896)   (0.013859) 
## N           19,007       19,007       19,007       4,134    
## R2         0.319459     0.349029     0.140054     0.398899  
## ============================================================
## Notes:   Standard errors in parentheses.

FURTHER SPECIFICATION TESTS FOR PANEL DATA

Test for heteroskedasticity within panel data

# H0) The null hypothesis for the Breusch-Pagan test is homoskedasticity

# takes too long to compute

# bptest(data = nlswork_clean, ln_wage ~ union +
#          collgrad +age +agesq +tenure +tensq +
#          not_smsa +south +c_city + factor(idcode), studentize=F)


  bptest(data = nlswork_balanced, ln_wage ~ union +
           collgrad +age +agesq +tenure +tensq +
           not_smsa +south +c_city + factor(idcode), studentize=F)
## 
##  Breusch-Pagan test
## 
## data:  ln_wage ~ union + collgrad + age + agesq + tenure + tensq + not_smsa +     south + c_city + factor(idcode)
## BP = 5.2368, df = 10, p-value = 0.8748

Test of serial correlation within panel data

  • Unobserved effects test <<>> Wooldridge’s test for unobserved individual effects <<>>
  pwtest(data = nlswork_clean, ln_wage ~ union +
           collgrad +age +agesq +tenure +tensq +
           not_smsa +south +c_city)

Locally robust tests for serial correlation or random effects

  • Baltagi and Li AR-RE joint test - balanced panel
  pbsytest(data = nlswork_balanced, ln_wage ~ union +
             collgrad +age +agesq +tenure +tensq +
             not_smsa +south +c_city, test="j")
## 
##  Baltagi and Li AR-RE joint test
## 
## data:  formula
## chisq = 50.736, df = 2, p-value = 9.612e-12
## alternative hypothesis: AR(1) errors or random effects

General serial correlation tests

  • Breusch-Godfrey/Wooldridge test for serial correlation in panel models <<>>
pbgtest(fe, order = 2)
## 
##  Breusch-Godfrey/Wooldridge test for serial correlation in panel models
## 
## data:  ln_wage ~ union + collgrad + age + agesq + tenure + tensq + not_smsa +  ...
## chisq = 181.23, df = 2, p-value < 2.2e-16
## alternative hypothesis: serial correlation in idiosyncratic errors

Wooldridge’s test for serial correlation in FE panels

  pwartest(data = nlswork_balanced, ln_wage ~ union +
             collgrad +age +agesq +tenure +tensq +
             not_smsa +south +c_city)
## 
##  Wooldridge's test for serial correlation in FE panels
## 
## data:  plm.model
## F = 12.205, df1 = 1, df2 = 53, p-value = 0.0009707
## alternative hypothesis: serial correlation

Wooldridge first-difference-based test

  pwfdtest(data = nlswork_balanced, ln_wage ~ union +
             collgrad +age +agesq +tenure +tensq +
             not_smsa +south +c_city)
## 
##  Wooldridge's first-difference test for serial correlation in panels
## 
## data:  plm.model
## F = 8.5778, df1 = 1, df2 = 48, p-value = 0.005192
## alternative hypothesis: serial correlation in differenced errors
  pwfdtest(data = nlswork_balanced, ln_wage ~ union +
             collgrad +age +agesq +tenure +tensq +
             not_smsa +south +c_city, h0="fe")
## 
##  Wooldridge's first-difference test for serial correlation in panels
## 
## data:  plm.model
## F = 2.9964, df1 = 1, df2 = 48, p-value = 0.08988
## alternative hypothesis: serial correlation in original errors

Tests for cross-sectional dependence

  pcdtest(data = nlswork_balanced, ln_wage ~ union +
            collgrad +age +agesq +tenure +tensq +
            not_smsa +south +c_city)
## 
##  Pesaran CD test for cross-sectional dependence in panels
## 
## data:  ln_wage ~ union + collgrad + age + agesq + tenure + tensq + not_smsa +     south + c_city
## z = 1.8975, p-value = 0.05776
## alternative hypothesis: cross-sectional dependence

HIGH DIMENSIONAL FIXED-EFFECTS

## CHECK: 'lfe' and 'fixest'
### https://github.com/sgaure/lfe
### https://github.com/lrberge/fixest

# *including 1 fixed effect*

  HDFE1a <- feols(data = nlswork_clean, ln_wage ~ union +
                    age +agesq +tenure +tensq +
                    not_smsa +south +c_city | idcode)
  
      summary(HDFE1a)
## OLS estimation, Dep. Var.: ln_wage
## Observations: 19,007 
## Fixed-effects: idcode: 4,134
## Standard-errors: Clustered (idcode) 
##           Estimate Std. Error   t value   Pr(>|t|)    
## union     0.093877   0.009566  9.814146  < 2.2e-16 ***
## age       0.024259   0.005008  4.843618 1.3216e-06 ***
## agesq    -0.000226   0.000081 -2.778359 5.4881e-03 ** 
## tenure    0.032966   0.002085 15.807147  < 2.2e-16 ***
## tensq    -0.001100   0.000126 -8.719205  < 2.2e-16 ***
## not_smsa -0.093105   0.019791 -4.704515 2.6279e-06 ***
## south    -0.063222   0.021654 -2.919622 3.5235e-03 ** 
## c_city    0.011409   0.012606  0.905058 3.6549e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.225053     Adj. R2: 0.703151
##                  Within R2: 0.140054
  HDFE1b <- felm(data = nlswork_clean, ln_wage ~ union +
                   age +agesq +tenure +tensq +
                   not_smsa +south +c_city | idcode, clustervar=c("idcode"))
## Warning: Argument(s) clustervar are deprecated and will be removed, use
## multipart formula instead
      summary(HDFE1b)
## 
## Call:
##    felm(formula = ln_wage ~ union + age + agesq + tenure + tensq +      not_smsa + south + c_city | idcode, data = nlswork_clean,      clustervar = c("idcode")) 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8803 -0.1022  0.0000  0.1077  2.8071 
## 
## Coefficients:
##            Estimate Cluster s.e. t value Pr(>|t|)    
## union     9.388e-02    9.565e-03   9.814  < 2e-16 ***
## age       2.426e-02    5.008e-03   4.844 1.32e-06 ***
## agesq    -2.262e-04    8.141e-05  -2.778  0.00549 ** 
## tenure    3.297e-02    2.086e-03  15.807  < 2e-16 ***
## tensq    -1.100e-03    1.262e-04  -8.719  < 2e-16 ***
## not_smsa -9.310e-02    1.979e-02  -4.705 2.63e-06 ***
## south    -6.322e-02    2.165e-02  -2.920  0.00352 ** 
## c_city    1.141e-02    1.261e-02   0.905  0.36549    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2545 on 14865 degrees of freedom
## Multiple R-squared(full model): 0.7678   Adjusted R-squared: 0.7032 
## Multiple R-squared(proj model): 0.1401   Adjusted R-squared: -0.0995 
## F-statistic(full model, *iid*):11.87 on 4141 and 14865 DF, p-value: < 2.2e-16 
## F-statistic(proj model):   168 on 8 and 4133 DF, p-value: < 2.2e-16
# *including a 2nd fixed effect*

  HDFE2a <- feols(data = nlswork_clean, ln_wage ~ union +
                    age +agesq +tenure +tensq +
                    not_smsa +south +c_city | idcode + year)
  
      summary(HDFE2a)
## OLS estimation, Dep. Var.: ln_wage
## Observations: 19,007 
## Fixed-effects: idcode: 4,134,  year: 12
## Standard-errors: Clustered (idcode) 
##           Estimate Std. Error   t value   Pr(>|t|)    
## union     0.095700   0.009523 10.048910  < 2.2e-16 ***
## age       0.073440   0.013588  5.404711 6.8573e-08 ***
## agesq    -0.000720   0.000116 -6.218794 5.5065e-10 ***
## tenure    0.032423   0.002104 15.408152  < 2.2e-16 ***
## tensq    -0.001090   0.000129 -8.443532  < 2.2e-16 ***
## not_smsa -0.090537   0.019619 -4.614644 4.0571e-06 ***
## south    -0.064281   0.021622 -2.972941 2.9666e-03 ** 
## c_city    0.010432   0.012668  0.823497 4.1027e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.223942     Adj. R2: 0.705857
##                  Within R2: 0.066421
  HDFE2b <- felm(data = nlswork_clean, ln_wage ~ union +
                   age +agesq +tenure +tensq +
                   not_smsa +south +c_city  | idcode + year, clustervar=c("idcode"))
## Warning: Argument(s) clustervar are deprecated and will be removed, use
## multipart formula instead
      summary(HDFE2b)
## 
## Call:
##    felm(formula = ln_wage ~ union + age + agesq + tenure + tensq +      not_smsa + south + c_city | idcode + year, data = nlswork_clean,      clustervar = c("idcode")) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90155 -0.09933  0.00000  0.10738  2.78536 
## 
## Coefficients:
##            Estimate Cluster s.e. t value Pr(>|t|)    
## union     0.0956999    0.0095207  10.052  < 2e-16 ***
## age       0.0734400    0.0135842   5.406 6.80e-08 ***
## agesq    -0.0007205    0.0001158  -6.221 5.44e-10 ***
## tenure    0.0324225    0.0021036  15.413  < 2e-16 ***
## tensq    -0.0010902    0.0001291  -8.446  < 2e-16 ***
## not_smsa -0.0905368    0.0196138  -4.616 4.03e-06 ***
## south    -0.0642811    0.0216158  -2.974  0.00296 ** 
## c_city    0.0104319    0.0126641   0.824  0.41014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2533 on 14854 degrees of freedom
## Multiple R-squared(full model): 0.7701   Adjusted R-squared: 0.7059 
## Multiple R-squared(proj model): 0.06642   Adjusted R-squared: -0.1945 
## F-statistic(full model, *iid*):11.98 on 4152 and 14854 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 62.59 on 8 and 4133 DF, p-value: < 2.2e-16

Exercise with simulated data

See the Stata file ‘stata_do_example.do’ that produces the data in folder tmp_files.

simulated <- read_dta("data/data_simulation.dta")

# names(nlswork)
# head(nlswork)
# str(nlswork)

EDA: Exploratory Data Analysis

# eda_report(simulated,output_dir = "EDA/",output_file = "eda_simulated.pdf")

  ExpData(simulated,type=1)
  ExpData(simulated,type=2)
  summary(simulated)
##     workerid        year            ui               quarter     
##  Min.   :  1   Min.   : 1.0   Min.   :0.0004503   Min.   :1.000  
##  1st Qu.:124   1st Qu.: 3.0   1st Qu.:0.2438383   1st Qu.:2.000  
##  Median :248   Median : 5.5   Median :0.4766747   Median :2.000  
##  Mean   :248   Mean   : 5.5   Mean   :0.4879205   Mean   :2.517  
##  3rd Qu.:372   3rd Qu.: 8.0   3rd Qu.:0.7447072   3rd Qu.:4.000  
##  Max.   :495   Max.   :10.0   Max.   :0.9957856   Max.   :4.000  
##                                                                  
##        q1              wage             educ            exper      
##  Min.   :0.0000   Min.   : 662.1   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:1226.8   1st Qu.: 2.322   1st Qu.: 9.00  
##  Median :0.0000   Median :1718.1   Median : 4.646   Median :15.00  
##  Mean   :0.2404   Mean   :1998.8   Mean   : 5.226   Mean   :14.34  
##  3rd Qu.:0.0000   3rd Qu.:2482.9   3rd Qu.: 7.635   3rd Qu.:19.00  
##  Max.   :1.0000   Max.   :7170.2   Max.   :19.446   Max.   :28.00  
##                                                                    
##      union            exper2          lnwage           yy1           yy2     
##  Min.   :0.0000   Min.   :  0.0   Min.   :6.495   Min.   :0.0   Min.   :0.0  
##  1st Qu.:0.0000   1st Qu.: 81.0   1st Qu.:7.112   1st Qu.:0.0   1st Qu.:0.0  
##  Median :0.0000   Median :225.0   Median :7.449   Median :0.0   Median :0.0  
##  Mean   :0.4863   Mean   :247.3   Mean   :7.483   Mean   :0.1   Mean   :0.1  
##  3rd Qu.:1.0000   3rd Qu.:361.0   3rd Qu.:7.817   3rd Qu.:0.0   3rd Qu.:0.0  
##  Max.   :1.0000   Max.   :784.0   Max.   :8.878   Max.   :1.0   Max.   :1.0  
##                                                                              
##       yy3           yy4           yy5           yy6           yy7     
##  Min.   :0.0   Min.   :0.0   Min.   :0.0   Min.   :0.0   Min.   :0.0  
##  1st Qu.:0.0   1st Qu.:0.0   1st Qu.:0.0   1st Qu.:0.0   1st Qu.:0.0  
##  Median :0.0   Median :0.0   Median :0.0   Median :0.0   Median :0.0  
##  Mean   :0.1   Mean   :0.1   Mean   :0.1   Mean   :0.1   Mean   :0.1  
##  3rd Qu.:0.0   3rd Qu.:0.0   3rd Qu.:0.0   3rd Qu.:0.0   3rd Qu.:0.0  
##  Max.   :1.0   Max.   :1.0   Max.   :1.0   Max.   :1.0   Max.   :1.0  
##                                                                       
##       yy8           yy9           yy10       lag_lnwage   
##  Min.   :0.0   Min.   :0.0   Min.   :0.0   Min.   :6.495  
##  1st Qu.:0.0   1st Qu.:0.0   1st Qu.:0.0   1st Qu.:7.085  
##  Median :0.0   Median :0.0   Median :0.0   Median :7.425  
##  Mean   :0.1   Mean   :0.1   Mean   :0.1   Mean   :7.455  
##  3rd Qu.:0.0   3rd Qu.:0.0   3rd Qu.:0.0   3rd Qu.:7.784  
##  Max.   :1.0   Max.   :1.0   Max.   :1.0   Max.   :8.724  
##                                            NA's   :495
  ftable(simulated$year)
##    1   2   3   4   5   6   7   8   9  10
##                                         
##  495 495 495 495 495 495 495 495 495 495
  ExpCTable(simulated)
  ExpCatViz(simulated)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

TRY IN A ‘JUPYTER NOTEBOOK’: ExpNumViz(nlswork)

  ExpNumStat(simulated,by="A",Outlier = TRUE,round=2,Qnt=c(0.1,0.25,0.50,0.99))
  ExpNumViz(simulated,Page=c(6,2))
## $`0`

  ExpOutliers(simulated,varlist=c("lnwage"))
## $outlier_summary
##              Category           lnwage
## 1    Lower cap : 0.05 6.75808242272123
## 2    Upper cap : 0.95 8.32169651894011
## 3         Lower bound             6.05
## 4         Upper bound             8.87
## 5     Num of outliers                1
## 6  Lower outlier case                 
## 7  Upper outlier case              210
## 8         Mean before             7.48
## 9          Mean after             7.48
## 10      Median before 7.44898780138622
## 11       Median after 7.44896911785069
## 
## $outlier_data
##       workerid year        ui quarter q1     wage      educ exper union exper2
##    1:        1    1 0.5734584       2  0 1913.481 6.3080420    17     1    289
##    2:        1    2 0.5734584       2  0 2065.687 6.3080420    18     1    324
##    3:        1    3 0.5734584       2  0 2015.642 7.3080420    19     1    361
##    4:        1    4 0.5734584       2  0 2274.630 8.3080425    20     1    400
##    5:        1    5 0.5734584       2  0 2576.639 9.3080425    21     1    441
##   ---                                                                         
## 4946:      495    6 0.3515452       1  1 1257.155 0.0000000    17     1    289
## 4947:      495    7 0.3515452       1  1 1170.614 0.3515451    18     1    324
## 4948:      495    8 0.3515452       1  1 1211.788 0.3515451    19     1    361
## 4949:      495    9 0.3515452       1  1 1346.442 0.3515451    20     1    400
## 4950:      495   10 0.3515452       1  1 1279.959 0.3515451    21     0    441
##         lnwage yy1 yy2 yy3 yy4 yy5 yy6 yy7 yy8 yy9 yy10 lag_lnwage
##    1: 7.556680   1   0   0   0   0   0   0   0   0    0         NA
##    2: 7.633218   0   1   0   0   0   0   0   0   0    0   7.556680
##    3: 7.608693   0   0   1   0   0   0   0   0   0    0   7.633218
##    4: 7.729573   0   0   0   1   0   0   0   0   0    0   7.608693
##    5: 7.854241   0   0   0   0   1   0   0   0   0    0   7.729573
##   ---                                                             
## 4946: 7.136607   0   0   0   0   0   1   0   0   0    0   6.973784
## 4947: 7.065284   0   0   0   0   0   0   1   0   0    0   7.136607
## 4948: 7.099852   0   0   0   0   0   0   0   1   0    0   7.065284
## 4949: 7.205221   0   0   0   0   0   0   0   0   1    0   7.099852
## 4950: 7.154584   0   0   0   0   0   0   0   0   0    1   7.205221
##       out_cap_lnwage
##    1:       7.556680
##    2:       7.633218
##    3:       7.608693
##    4:       7.729573
##    5:       7.854241
##   ---               
## 4946:       7.136607
## 4947:       7.065284
## 4948:       7.099852
## 4949:       7.205221
## 4950:       7.154584
## 
## $outlier_index
## $outlier_index$upper_out_index
## $outlier_index$upper_out_index[[1]]
## [1] 210
## 
## 
## $outlier_index$lower_out_index
## $outlier_index$lower_out_index[[1]]
## numeric(0)
  vis_dat(simulated)

# gg_miss_upset(simulated)

  stargazer(simulated,
            title = "Summary statistics",
            label = "tb:statistcis",
            table.placement = "ht",
            header=FALSE,type="text")
## 
## Summary statistics
## ======================================================
## Statistic    N     Mean    St. Dev.    Min      Max   
## ------------------------------------------------------
## workerid   4,950  248.000   142.908     1       495   
## year       4,950   5.500     2.873      1       10    
## ui         4,950   0.488     0.289   0.0005    0.996  
## quarter    4,950   2.517     1.128      1        4    
## q1         4,950   0.240     0.427      0        1    
## wage       4,950 1,998.779 1,032.194 662.083 7,170.242
## educ       4,950   5.226     3.795    0.000   19.446  
## exper      4,950  14.336     6.460      0       28    
## union      4,950   0.486     0.500      0        1    
## exper2     4,950  247.252   187.126     0       784   
## lnwage     4,950   7.483     0.476    6.495    8.878  
## yy1        4,950   0.100     0.300      0        1    
## yy2        4,950   0.100     0.300      0        1    
## yy3        4,950   0.100     0.300      0        1    
## yy4        4,950   0.100     0.300      0        1    
## yy5        4,950   0.100     0.300      0        1    
## yy6        4,950   0.100     0.300      0        1    
## yy7        4,950   0.100     0.300      0        1    
## yy8        4,950   0.100     0.300      0        1    
## yy9        4,950   0.100     0.300      0        1    
## yy10       4,950   0.100     0.300      0        1    
## lag_lnwage 4,455   7.455     0.470    6.495    8.724  
## ------------------------------------------------------
## DASHBOARD

  ####### ExPanD()

Regressions

OBSERVE THE MISTAKE FOLLOWING THE INTRODUCTION OF TIME DUMMIES AND EXPERIENCE IN THE FIXED-EFFECTS MODEL

  pols <- plm(data = simulated, lnwage ~ educ + exper + 
              exper2 + factor(year), 
            model="pooling", index=c("workerid", "year"))

  re <- plm(data = simulated, lnwage ~ educ + exper + 
            exper2 + factor(year), 
          model="random", index=c("workerid", "year"))

  plmtest(pols, type=c("bp"))
## 
##  Lagrange Multiplier Test - (Breusch-Pagan)
## 
## data:  lnwage ~ educ + exper + exper2 + factor(year)
## chisq = 19885, df = 1, p-value < 2.2e-16
## alternative hypothesis: significant effects
  fe <- plm(data = simulated, lnwage ~ educ + exper + 
              exper2 + factor(year), 
            model="within", index=c("workerid", "year"))
  
  pFtest(fe, pols)
## 
##  F test for individual effects
## 
## data:  lnwage ~ educ + exper + exper2 + factor(year)
## F = 270.97, df1 = 493, df2 = 4444, p-value < 2.2e-16
## alternative hypothesis: significant effects
  phtest(fe, re)
## 
##  Hausman Test
## 
## data:  lnwage ~ educ + exper + exper2 + factor(year)
## chisq = 309.92, df = 11, p-value < 2.2e-16
## alternative hypothesis: one model is inconsistent
  pols_robust <- coeftest(pols, function(x) vcovHC(x, type = 'sss')) 
  re_robust <- coeftest(re, function(x) vcovHC(x, type = 'sss')) 
  fe_robust <- coeftest(fe, function(x) vcovHC(x, type = 'sss')) 
  
  stargazer(pols_robust,re_robust, fe_robust,title = "Regression analysis", 
            model.numbers = FALSE,
            column.labels = c("Pooled (cluster)","RE (cluster)","FE (cluster"),
            label = "regressions",
            table.placement = "!ht",
            notes.append = FALSE,
            notes.align="l",
            notes="Standard errors in parentheses.",
            header = FALSE,
            no.space = TRUE,
            omit = c("Constant"),
            omit.stat = c("adj.rsq","f","ser"),
            digits = 3,
            digits.extra = 5,
            omit.yes.no = c("Constant",""),
            dep.var.caption="",
            dep.var.labels.include = FALSE,
            style = "qje",
            type="text")
## 
## Regression analysis
## ========================================================
##                Pooled (cluster) RE (cluster) FE (cluster
## educ               0.107***       0.067***    0.062***  
##                    (0.002)        (0.001)      (0.001)  
## exper              0.013***       0.005**     0.025***  
##                    (0.005)        (0.002)      (0.001)  
## exper2             -0.0003*       0.000004    0.000004  
##                    (0.0002)      (0.00002)    (0.00002) 
## factor(year)2       -0.001        0.019***     0.0002   
##                    (0.004)        (0.004)      (0.003)  
## factor(year)3       -0.002        0.038***      0.001   
##                    (0.005)        (0.005)      (0.003)  
## factor(year)4       -0.008        0.052***     -0.004*  
##                    (0.007)        (0.007)      (0.003)  
## factor(year)5       -0.004        0.075***     -0.0002  
##                    (0.009)        (0.009)      (0.002)  
## factor(year)6       -0.005        0.095***      0.001   
##                    (0.010)        (0.011)      (0.003)  
## factor(year)7       -0.009        0.112***     -0.001   
##                    (0.012)        (0.013)      (0.003)  
## factor(year)8       -0.011        0.131***     0.00004  
##                    (0.014)        (0.015)      (0.003)  
## factor(year)9       -0.013        0.148***     -0.002   
##                    (0.016)        (0.017)      (0.003)  
## factor(year)10      -0.011        0.169***              
##                    (0.018)        (0.019)               
## ========================================================
## Notes:         Standard errors in parentheses.

Close the log file

end_time <- Sys.time()

end_time - start_time
## Time difference of 26.96878 secs
  # sprintf(end_time - start_time,fmt = '%#.1f')
#